This project investigates housing affordability and growth trends across U.S. metropolitan areas, with the goal of identifying regions where housing is becoming increasingly accessible or strained. Using publicly available datasets from the American Community Survey (ACS), the U.S. Census Bureau’s building permits records, and the Bureau of Labor Statistics (BLS), we integrate household income, rent, population, employment, and industry data to construct comprehensive indices of housing affordability and growth.
Data Acquisition
Household Demographic and Economic Indicators from American Community Survey (ACS)
Show code
if(!dir.exists(file.path("data", "mp02"))){dir.create(file.path("data", "mp02"), showWarnings=FALSE, recursive=TRUE)}library <-function(pkg){## Mask base::library() to automatically install packages if needed## Masking is important here so downlit picks up packages and links## to documentation pkg <-as.character(substitute(pkg))options(repos =c(CRAN ="https://cloud.r-project.org"))if(!require(pkg, character.only=TRUE, quietly=TRUE)) install.packages(pkg)stopifnot(require(pkg, character.only=TRUE, quietly=TRUE))}library(tidyverse)library(glue)library(readxl)library(tidycensus)get_acs_all_years <-function(variable, geography="cbsa",start_year=2009, end_year=2023){ fname <-glue("{variable}_{geography}_{start_year}_{end_year}.csv") fname <-file.path("data", "mp02", fname)if(!file.exists(fname)){ YEARS <-seq(start_year, end_year) YEARS <- YEARS[YEARS !=2020] # Drop 2020 - No survey (covid) ALL_DATA <-map(YEARS, function(yy){ tidycensus::get_acs(geography, variable, year=yy, survey="acs1") |>mutate(year=yy) |>select(-moe, -variable) |>rename(!!variable := estimate) }) |>bind_rows()write_csv(ALL_DATA, fname) }read_csv(fname, show_col_types=FALSE)}# Household income (12 month)INCOME <-get_acs_all_years("B19013_001") |>rename(household_income = B19013_001)# Monthly rentRENT <-get_acs_all_years("B25064_001") |>rename(monthly_rent = B25064_001)# Total populationPOPULATION <-get_acs_all_years("B01003_001") |>rename(population = B01003_001)# Total number of householdsHOUSEHOLDS <-get_acs_all_years("B11001_001") |>rename(households = B11001_001)
library(httr2)library(rvest)get_bls_industry_codes <-function(){ fname <-file.path("data", "mp02", "bls_industry_codes.csv")library(dplyr)library(tidyr)library(readr)if(!file.exists(fname)){ resp <-request("https://www.bls.gov") |>req_url_path("cew", "classifications", "industry", "industry-titles.htm") |>req_headers(`User-Agent`="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |>req_error(is_error = \(resp) FALSE) |>req_perform()resp_check_status(resp) naics_table <-resp_body_html(resp) |>html_element("#naics_titles") |>html_table() |>mutate(title =str_trim(str_remove(str_remove(`Industry Title`, Code), "NAICS"))) |>select(-`Industry Title`) |>mutate(depth =if_else(nchar(Code) <=5, nchar(Code) -1, NA)) |>filter(!is.na(depth))# These were looked up manually on bls.gov after finding # they were presented as ranges. Since there are only three# it was easier to manually handle than to special-case everything else naics_missing <- tibble::tribble(~Code, ~title, ~depth, "31", "Manufacturing", 1,"32", "Manufacturing", 1,"33", "Manufacturing", 1,"44", "Retail", 1, "45", "Retail", 1,"48", "Transportation and Warehousing", 1, "49", "Transportation and Warehousing", 1 ) naics_table <-bind_rows(naics_table, naics_missing) naics_table <- naics_table |>filter(depth ==4) |>rename(level4_title=title) |>mutate(level1_code =str_sub(Code, end=2), level2_code =str_sub(Code, end=3), level3_code =str_sub(Code, end=4)) |>left_join(naics_table, join_by(level1_code == Code)) |>rename(level1_title=title) |>left_join(naics_table, join_by(level2_code == Code)) |>rename(level2_title=title) |>left_join(naics_table, join_by(level3_code == Code)) |>rename(level3_title=title) |>select(-starts_with("depth")) |>rename(level4_code = Code) |>select(level1_title, level2_title, level3_title, level4_title, level1_code, level2_code, level3_code, level4_code) |>drop_na() |>mutate(across(contains("code"), as.integer))write_csv(naics_table, fname) }read_csv(fname, show_col_types=FALSE)}INDUSTRY_CODES <-get_bls_industry_codes()
BLS Quarterly Census of Employment and Wages
Show code
library(httr2)library(rvest)get_bls_qcew_annual_averages <-function(start_year=2009, end_year=2023){ fname <-glue("bls_qcew_{start_year}_{end_year}.csv.gz") fname <-file.path("data", "mp02", fname) YEARS <-seq(start_year, end_year) YEARS <- YEARS[YEARS !=2020] # Drop Covid year to match ACSif(!file.exists(fname)){ ALL_DATA <-map(YEARS, .progress=TRUE, possibly(function(yy){ fname_inner <-file.path("data", "mp02", glue("{yy}_qcew_annual_singlefile.zip"))if(!file.exists(fname_inner)){request("https://www.bls.gov") |>req_url_path("cew", "data", "files", yy, "csv",glue("{yy}_annual_singlefile.zip")) |>req_headers(`User-Agent`="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:143.0) Gecko/20100101 Firefox/143.0") |>req_retry(max_tries=5) |>req_perform(fname_inner) }if(file.info(fname_inner)$size <755e5){warning(sQuote(fname_inner), "appears corrupted. Please delete and retry this step.") }read_csv(fname_inner, show_col_types=FALSE) |>mutate(YEAR = yy) |>select(area_fips, industry_code, annual_avg_emplvl, total_annual_wages, YEAR) |>filter(nchar(industry_code) <=5, str_starts(area_fips, "C")) |>filter(str_detect(industry_code, "-", negate=TRUE)) |>mutate(FIPS = area_fips, INDUSTRY =as.integer(industry_code), EMPLOYMENT =as.integer(annual_avg_emplvl), TOTAL_WAGES = total_annual_wages) |>select(-area_fips, -industry_code, -annual_avg_emplvl, -total_annual_wages) |># 10 is a special value: "all industries" , so omitfilter(INDUSTRY !=10) |>mutate(AVG_WAGE = TOTAL_WAGES / EMPLOYMENT) })) |>bind_rows()write_csv(ALL_DATA, fname) } ALL_DATA <-read_csv(fname, show_col_types=FALSE) ALL_DATA_YEARS <-unique(ALL_DATA$YEAR) YEARS_DIFF <-setdiff(YEARS, ALL_DATA_YEARS)if(length(YEARS_DIFF) >0){stop("Download failed for the following years: ", YEARS_DIFF, ". Please delete intermediate files and try again.") } ALL_DATA}WAGES <-get_bls_qcew_annual_averages()
Data Integration and Initial Exploration
Multi-Table Questions
1. Which CBSA (by name) permitted the largest number of new housing units in the decade from 2010 to 2019 (inclusive)?
Show code
# CBSA by name permitted largest number of new housing unit 2010-2019top_CBSA <- PERMITS |>filter(between(year, 2010, 2019)) |>group_by(CBSA) |>summarize(total_permits =sum(new_housing_units_permitted, na.rm =TRUE), .groups ="drop") |>arrange(desc(total_permits)) |>left_join(INCOME |>select(GEOID, NAME) |>distinct(), join_by(CBSA == GEOID)) |>select(CBSA, NAME, total_permits)# Create an interactive datatablelibrary(DT)datatable(top_CBSA,colname =c("CSBA","CSBA by Name", "Total Permits"),caption ="CBSA by name permitted largest number of new housing unit 2010-2019")
The CSBA (by name) that permitted the largest number of new housing units from 2010 to 2019 is CSBA code 26420 known as the Houston-Sugar Land-Baytown, TX Metro Area, at 482,075 permits.
2. In what year did Albuquerque, NM (CBSA Number 10740) permit the most new housing units?
Show code
# Filter and summarize Albuqurque, NM permit datanm_permit <- PERMITS |>filter(CBSA ==10740) |>group_by(year) |>summarize(total_permits =sum(new_housing_units_permitted, na.rm =TRUE), .groups ="drop") |>arrange(desc(total_permits))# Create an interactive datatabledatatable(nm_permit,colname =c("Year", "Total Permits"),caption ="New Housing Units Permitted in Albuquerque, NM (CBSA 10740)")
Show code
# Identify the top yeartop_nm_permit <- nm_permit |>slice_max(total_permits, n =1)
Albuquerque, NM (CBSA number 10740) permitted the most new housing units in 2021, with 4021 units.
3. Which state (not CBSA) had the highest average individual income in 2015?
Show code
library(dplyr)library(stringr)library(tibble)library(scales)library(DT)# extracting state abbreviation from CBSA nameextract_state <-function(name) {str_extract(name, "(?<=, )[A-Z]{2}")}# define state abbreviation to full state namestate_df <-data.frame(abb =c(state.abb, "DC", "PR"),name =c(state.name, "District of Columbia", "Puerto Rico"))# total income in 2015base_total <- INCOME |>filter(year ==2015) |>left_join(HOUSEHOLDS |>filter(year ==2015), by =c("GEOID", "NAME", "year")) |>left_join(POPULATION |>filter(year ==2015), by =c("GEOID", "NAME", "year")) |>mutate(state =extract_state(NAME),total_income = household_income * households )# aggregate to state level to compute average individual incomehigh_income <- base_total |>group_by(state) |>summarise(total_income =sum(total_income, na.rm =TRUE),total_population =sum(population, na.rm =TRUE),.groups ="drop" ) |>mutate(avg_income = total_income / total_population) |>arrange(desc(avg_income)) |>left_join(state_df, by =c("state"="abb")) |>mutate(total_income = scales::dollar(total_income, accuracy =1),avg_income = scales::dollar(avg_income, accuracy =1) ) |>select(name, state, total_population, total_income, avg_income)# Create an interactive datatabledatatable(high_income,colname =c("State Name", "State Abbreviation", "Total Population", "Total Income","Average Income"),caption ="Average and Total Income by State (2015)")
Show code
# Identify the top statetop_income <- base_total |>group_by(state) |>summarise(total_income =sum(total_income, na.rm =TRUE),total_population =sum(population, na.rm =TRUE),.groups ="drop" ) |>mutate(avg_income = total_income / total_population) |>slice_max(avg_income, n =1)
In 2015, the state with highest individual income was DC with an average income of $33,233.
4. What is the last year in which the NYC CBSA had the most data scientists in the country?
5. What fraction of total wages in the NYC CBSA was earned by people employed in the finance and insurance industries (NAICS code 52)? In what year did this fraction peak?
Initial Visualizations
1. The relationship between monthly rent and average household income per CBSA in 2009.
Show code
library(dplyr)library(ggplot2)library(scales)# Merge rent and income data for 2009rent_income_2009 <- RENT |>filter(year ==2009) |>left_join(INCOME |>filter(year ==2009), by =c("GEOID", "NAME", "year")) |>drop_na(monthly_rent, household_income)# Create scatterplotggplot(rent_income_2009, aes(x = household_income, y = monthly_rent)) +geom_point(alpha =0.6, size =2, color ="lightgreen") +geom_smooth(method ="lm", se =TRUE, color ="darkgreen", linewidth =0.8) +scale_x_continuous(labels =label_dollar()) +scale_y_continuous(labels =label_dollar()) +labs(title ="Relationship between Monthly Rent and Average Household Income per CBSA (2009)",x ="Average Household Income (USD)",y ="Average Monthly Rent (USD)" ) +theme_minimal(base_size =10)
2. The relationship between total employment and total employment in the health care and social services sector (NAICS 62) across different CBSAs.
Show code
library(dplyr)library(ggplot2)# Filter relevant industries and summarize by year + CBSAhealth_employment <- WAGES |>filter(INDUSTRY ==62) |>group_by(FIPS, YEAR) |>summarise(health_emp =sum(EMPLOYMENT, na.rm =TRUE), .groups ="drop")total_employment <- WAGES |>group_by(FIPS, YEAR) |>summarise(total_emp =sum(EMPLOYMENT, na.rm =TRUE), .groups ="drop")employment_joined <- total_employment |>left_join(health_employment, by =c("FIPS", "YEAR")) |>mutate(health_share = health_emp / total_emp)# Create scatterplotggplot(employment_joined, aes(x = total_emp, y = health_emp, color = YEAR)) +geom_point(alpha =0.6) +geom_smooth(method ="lm", se =FALSE, linewidth =0.9, color ="black") +scale_x_continuous(labels = scales::label_number(scale_cut = scales::cut_short_scale())) +scale_y_continuous(labels = scales::label_number(scale_cut = scales::cut_short_scale())) +scale_color_viridis_c(option ="plasma", end =0.9) +labs(title ="Health Care & Social Assistance Employment vs. Total Employment",subtitle ="Each point represents a CBSA by year",x ="Total Employment",y ="Health Care & Social Services Employment (NAICS 62)",color ="Year" ) +theme_minimal(base_size =13)
3. The evolution of average household size over time. Use different lines to represent different CBSAs.
Show code
library(dplyr)library(ggplot2)library(viridis)library(plotly)# Build base tablehousehold_base <- POPULATION |>select(GEOID, NAME, year, population) |>inner_join(HOUSEHOLDS |>select(GEOID, year, households),by =c("GEOID","year")) |>mutate(hh_size = population / households,NAME_short =str_remove(NAME, ",.*$")) |># collapse CBSAs with same root name (e.g. Atlanta variants)group_by(NAME_short, year) |>summarise(hh_size =mean(hh_size, na.rm =TRUE),population =sum(population, na.rm =TRUE),.groups ="drop" )# Find top 10 CBSAs by latest populationlatest_year <-max(household_base$year)top10 <- household_base |>filter(year == latest_year) |>slice_max(population, n =10) |>pull(NAME_short)household_top <- household_base |>filter(NAME_short %in% top10)# Create line plotplot <-ggplot(household_top, aes(x = year, y = hh_size, group = NAME_short, color = NAME_short)) +geom_line(linewidth =1) +geom_point(size =1.6) +scale_color_viridis_d(option ="D", end =0.9,guide =guide_legend(ncol =2, title =NULL)) +labs(title ="Average Household Size Over Time",subtitle ="Top 10 CBSAs by population",x ="Year", y ="Average Household Size" ) +theme_minimal(base_size =13) +theme(legend.position ="bottom")ggplotly(plot, tooltip =c("NAME_short", "year", "hh_size"))
Building Indices of Housing Affordability and Housing Stock
Rent Burden
Chicago Metro Area Rent Burden Change Over Time
For the Chicago Metro Area, the rent burden shows how housing costs relative to income have evolved over time. Years with higher rent burden values indicate that residents faced a greater share of income spent on rent, while lower values indicate more affordable housing. From 2019 to 2023, the Rent Burden Index has increased from 22.5 to 26.4. The interactive table highlights both Rent as % Income and the standardized 0–100 Rent Burden Index.
Show code
library(dplyr)library(scales)library(DT)# Join INCOME and RENT (use only GEOID and year)rent_income <- INCOME |>select(GEOID, NAME, year, household_income) |>inner_join( RENT |>select(GEOID, year, monthly_rent),by =c("GEOID", "year") )# Compute rent burdenrent_income <- rent_income |>mutate(rent_burden = (monthly_rent *12) / household_income,rent_index = scales::rescale(rent_burden, to =c(0, 100)) # Linear 0–100 )# Pick Chicago for my metro area of choicemetro_table <- rent_income |>filter(GEOID =="16980") |>arrange(year) |>mutate(rent_burden_pct = scales::percent(rent_burden, 0.1),rent_index =round(rent_index, 1) ) |>select(year, rent_burden_pct, rent_index)# Create interactive tabledatatable( metro_table,caption ="Rent Burden Over Time — Chicago Metro Area",colnames =c("Year", "Rent as % Income", "Rent Burden Index"),options =list(pageLength =10, autoWidth =TRUE))
Metro Areas With The Top 5 Highest and Lowest Rent Burden
As seen in the table below, the Metro Areas with the top 5 highest Rent Burden Index are all coastal cities/towns, ranging from 66.4 to 72.9. Meanwhile, the top 5 lowest are typically, not all, in land cities in less populated states, with Index ranging from 1.6 to 5.9.
Show code
library(DT)library(scales)library(dplyr)latest_year <-max(rent_income$year, na.rm =TRUE)rent_summary <- rent_income |>filter(year == latest_year) |>group_by(NAME) |>summarise(rent_burden =mean(rent_burden, na.rm =TRUE),rent_index =mean(rent_index, na.rm =TRUE),.groups ="drop" ) |>mutate(rent_burden_pct = scales::percent(rent_burden, 0.1),rent_index =round(rent_index, 1) )|>arrange(desc(rent_index))highest_burden <- rent_summary |>slice_max(rent_index, n =5)lowest_burden <- rent_summary |>slice_min(rent_index, n =5)burden_table <-bind_rows( highest_burden |>mutate(Category ="Highest Rent Burden"), lowest_burden |>mutate(Category ="Lowest Rent Burden")) |>select(NAME, Category, rent_burden_pct, rent_index)# Interactive table for top 5 and bottom 5 CBSAsdatatable( burden_table,caption =paste("CBSAs with Highest and Lowest Rent Burden —", latest_year),colnames =c("CBSA by Name", "Category","Rent as % Income","Rent Burden Index"),options =list(pageLength =10, autoWidth =TRUE))
Housing Growth
Show code
library(dplyr)library(scales)library(RcppRoll)library(DT)# Join POPULATION and PERMITShousing_data <- PERMITS |>inner_join( POPULATION |>select(GEOID, year, population),by =c("CBSA"="GEOID", "year") )# Compute 5-year population growthhousing_data <- housing_data |>group_by(CBSA) |>arrange(year) |>mutate(pop_lag5 =lag(population, 5),pop_growth_5yr = population - pop_lag5 ) |>ungroup() |>filter(!is.na(pop_growth_5yr))# Create instantaneous and rate-based housing growth metricshousing_data <- housing_data |>mutate(housing_growth_instant = new_housing_units_permitted / population *100,housing_growth_rate =ifelse(pop_growth_5yr >0, new_housing_units_permitted / pop_growth_5yr *100, NA) )# Standardize metrics (0-100)housing_data <- housing_data |>mutate(housing_growth_instant_index = scales::rescale(housing_growth_instant, to =c(0, 100)),housing_growth_rate_index = scales::rescale(housing_growth_rate, to =c(0, 100)) )# Composite index (average of the two metrics)housing_data <- housing_data |>mutate(composite_index = (housing_growth_instant_index + housing_growth_rate_index) /2 )# Summarize metrics by CBSA and join namescbsa_summary <- housing_data |>group_by(CBSA) |>summarise(instant_avg =mean(housing_growth_instant_index, na.rm =TRUE),rate_avg =mean(housing_growth_rate_index, na.rm =TRUE),composite_avg =mean(composite_index, na.rm =TRUE),.groups ="drop" ) |>left_join(POPULATION |>select(GEOID, NAME) |>distinct(),by =c("CBSA"="GEOID")) |>select(CBSA, NAME, instant_avg, rate_avg, composite_avg)# Step 7: Identify top and bottom CBSAstop_instant <- cbsa_summary |>slice_max(instant_avg, n =5)bottom_instant <- cbsa_summary |>slice_min(instant_avg, n =5)top_rate <- cbsa_summary |>slice_max(rate_avg, n =5)bottom_rate <- cbsa_summary |>slice_min(rate_avg, n =5)top_composite <- cbsa_summary |>slice_max(composite_avg, n =5)bottom_composite <- cbsa_summary |>slice_min(composite_avg, n =5)# Step 8: Interactive DT tables (example for composite)datatable( cbsa_summary |>arrange(desc(composite_avg)) |>mutate(instant_avg =round(instant_avg, 2),rate_avg =round(rate_avg, 2),composite_avg =round(composite_avg, 2) ),caption ="Housing Growth Metrics by CBSA (2014-2019)",options =list(pageLength =10, autoWidth =TRUE))